To run this script for the ABE data: Rscript run_analysis_individual.R 2 markers_ABE sce_ABE.rds analysis_ABE.html
To run this script for the CBE data: Rscript run_analysis_individual.R 1 markers_CBE sce_CBE.rds analysis_CBE.html
run_analysis_individual.R:
params=commandArgs(trailingOnly=TRUE) rmarkdown::render(“analysis_individual.Rmd”,output_file=params[4],params=list(dataset_ind=params[1],save_file_name_markers=params[2],sce_save_file=params[3]))
knitr::opts_chunk$set(warning=FALSE,message=FALSE)
library(bluster)
source("../../core_functions.R")
library("batchelor")
library("scater")
library(scran)
library(dplyr)
library(igraph)
library(RColorBrewer)
library(pheatmap)
library(Seurat)
library(destiny)
library(edgeR)
library(MASS)
library(energy)
library(ggthemes)
library(progeny)
library(ggridges)
library(stringr)
library(ggrepel)
library(mclust)
set.seed(4444)
# knitr::opts_chunk$set(
# echo = FALSE,
# message = FALSE,
# warning = FALSE,
# message = FALSE,
# dev = c("pdf"),
# dpi=300
# )
#
#params=c(2 ,"markers_ABE", "sce_ABE.rds", "analysis_ABE.html")
dataset_ind <- as.double(params[1])
sce_list_file <- "sce_list_gRNA_iBAR.rds"
save_file_name_markers <- params[2]
sce_save_file <- params[3]
folder_cellranger <- "../cellranger/cellranger700_count_46539_7105STDY13259924_GRCh38-2020-A"
yy <- "CBE"
if (dataset_ind==2){yy <- "ABE"}
cell_cycle_frequency_save_file <- paste0("cell_cycle_",yy,".csv")
cell_cycle_frequency_save_file2 <- paste0("cell_cycle_target_gene",yy,".csv")
proliferation_file <- paste0("proliferation",yy,".csv")
markers_file_all <- paste0("markers",yy,"_all.csv")
markers_file <- paste0("markers",yy,".csv")
markers_file_progeny <- paste0("markers",yy,"_progeny.csv")
markers_file_MAYA_hallmark <- paste0("markers",yy,"_MAYA_hallmark.csv")
markers_file_BCL2 <- paste0("markers",yy,"_BCL2.csv")
file_energy_distance <- paste0("ed_",yy,".rds")
colourscheme_variant_class <- c("control"="lightgrey","canonical drug resistance"="darkblue","driver"="orange","drug addiction"="darkred")
We read in the SingleCellExperiment object, for which QC, gRNA and iBAR calling had been performed previously, and add meta-data from the pooled experiments.
sce_list <- readRDS(sce_list_file)
sce <- sce_list[[dataset_ind]]
features <- read.delim(paste0(folder_cellranger,"/filtered_feature_bc_matrix/features.tsv.gz"),header=FALSE,stringsAsFactors = FALSE)
rowData(sce) <- features[match(rownames(sce),features$V1),]
validation_lib <- read.table("../../sc_HT29_debcet_annotated_270623.csv",sep=",",header=TRUE)
#replace the line above using supplementary table 1 from our preprint
if(yy=="ABE"){
feature_ref <- read.table("feature_ref_gRNA_ABE.csv",sep=",",header=TRUE)
}else{
feature_ref <- read.table("feature_ref_gRNA_CBE.csv",sep=",",header=TRUE)
}
if (yy=="CBE"){
validation_lib <- validation_lib[validation_lib$editor=="CBE" & validation_lib$sgRNA_ID%in%feature_ref$id,]
}else{
validation_lib <- validation_lib[validation_lib$editor=="ABE" & validation_lib$sgRNA_ID%in%feature_ref$id,]
}
rename_gRNAs <- function(x){
v <- strsplit(x,"-")[[1]]
w <- validation_lib$ID[match(v,validation_lib$sgRNA_ID)]
if (length(w) > 1){
x <- w
w <- x[1]
if (length(x)>1){
for (j in 2:length(x))
w <- paste0(w,"-",x[j])
}
}
return(w)
}
meta_lfc <- function(x)
{
v <- strsplit(x,"-")[[1]]
w <- validation_lib$L2FC_HT29_DebCet_plasmid_1[match(v,validation_lib$sgRNA_ID)]
if (length(w) > 1){
x <- w
w <- x[1]
if (length(x)>1){
for (j in 2:length(x))
w <- paste0(w,"-",x[j])
}
}
return(w)
}
meta_class <- function(x)
{
v <- strsplit(x,"-")[[1]]
w <- validation_lib$variant_class[match(v,validation_lib$ID)]
if (length(w) > 1){
x <- w
w <- x[1]
if (length(x)>1){
for (j in 2:length(x))
w <- paste0(w,"-",x[j])
}
}
return(w)
}
meta_resistance <- function(x)
{
v <- strsplit(x,"-")[[1]]
w <- validation_lib$resistance_mutation[match(v,validation_lib$ID)]
if (length(w) > 1){
x <- w
w <- x[1]
if (length(x)>1){
for (j in 2:length(x))
w <- paste0(w,"-",x[j])
}
}
return(w)
}
meta_target <- function(x)
{
v <- strsplit(x,"-")[[1]]
w <- validation_lib$Gene[match(v,validation_lib$ID)]
if (length(w) > 1){
x <- w
w <- x[1]
if (length(x)>1){
for (j in 2:length(x))
w <- paste0(w,"-",x[j])
}
}
return(w)
}
meta_type <- function(x)
{
v <- strsplit(x,"-")[[1]]
w <- validation_lib$sgRNA_type[match(v,validation_lib$ID)]
if (length(w) > 1){
x <- w
w <- x[1]
if (length(x)>1){
for (j in 2:length(x))
w <- paste0(w,"-",x[j])
}
}
return(w)
}
sce$gRNA <- sapply(sce$gRNA,rename_gRNAs)
sce$l2fc <- sapply(sce$gRNA,meta_lfc)
sce$target_gene <- sapply(sce$gRNA,meta_target)
sce$variant_class <- sapply(sce$gRNA,meta_class)
sce$resistance_mutation <- sapply(sce$gRNA,meta_resistance)
sce$gRNA_type <- sapply(sce$gRNA,meta_type)
We plot the number of gRNAs called in a cell that passed QC. If there are several gRNAs in a cell, then each of them is counted. The figure below shows that cells with gRNAs targeting splice-essential sites proliferate much less than the other groups. The gRNAs targeting splice-essential sites were added as a positive control to check their depletion, and will be removed from downstream analysis.
nr_gRNA <- table(factor(unlist(lapply(sce$gRNA,function(x) strsplit(x,"-")[[1]])),levels=validation_lib$ID))
validation_lib$nr_gRNA <- nr_gRNA[match(validation_lib$ID,names(nr_gRNA))]
ggplot(validation_lib,mapping=aes(x=nr_gRNA,color=sgRNA_type)) +geom_density(size=1.5)+scale_color_colorblind()+theme_classic(base_size=24)+xlab("number of cells")+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))
sce <- sce[,!(grepl("splice_essential",sce$gRNA_type))]
After removing the splice essential gRNAs, we are left with the following number of cells: 12327.
The following is a histogram of number of cells per gRNA, with the splice essential gRNAs excluded.
nr_gRNA <- table(unlist(lapply(sce$gRNA,function(x) strsplit(x,"-")[[1]])))
ggplot(mapping=aes(x=nr_gRNA)) + geom_histogram()+scale_x_continuous(trans='log10') + theme_bw()+xlab("number of cells per gRNA") + ylab("number of gRNAs")
The following figure is a histogram of the number of different iBARs per gRNA.
sce$gRNA_iBAR <- paste0(sce$gRNA,"-",sce$iBAR)
ggplot(mapping=aes(x=table(sce$gRNA_iBAR))) + geom_histogram()+scale_x_continuous() + theme_bw()+xlab("number of cells per iBAR") + ylab("number of iBARs")
Mean number of cells per iBAR-group: 1.5086281.
We combine cells with the same gRNA-iBAR combination by averaging their gene expression. We use the iBAR-gRNA combinations rather than iBARs. As the iBAR is only 6nt long, the identical iBAR sequence may have been inserted in several cells, while in combination with the gRNA, the iBAR creates a unique identifier for a small group of cells with the same genoytpe. This solves the problem of biases resulting from unequal numbers of cells originating from different cells and having potentially different edits. We call a group of cells with the same iBAR-gRNA combination, and therefore the same genotype, an iBAR-group.
Counts <- as.matrix(counts(sce))
sce_counts_iBAR <- sapply(1:length(unique(sce$gRNA_iBAR)),function(x) return(rowSums(Counts[,sce$gRNA_iBAR==unique(sce$gRNA_iBAR)[x],drop=FALSE])))
rownames(sce_counts_iBAR) <- rownames(sce)
colnames(sce_counts_iBAR) <- unique(sce$gRNA_iBAR)
sce_iBAR <- SingleCellExperiment(assays=list(counts=sce_counts_iBAR),rowData=rowData(sce))
cpm_iBAR <- cpm(counts(sce_iBAR),log=TRUE)
sce_iBAR$gRNA <- sce$gRNA[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$l2fc <- sce$l2fc[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$target_gene <- sce$target_gene[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$variant_class <- sce$variant_class[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$resistance_mutation <- sce$resistance_mutation[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR <- logNormCounts(sce_iBAR)
#saveRDS(sce_iBAR,file=sce_save_file)
Number of different iBAR-groups: 8171
Computing the average number of iBAR-groups per gRNA using also cells with multiple gRNAs:
nr_gRNAs_sce_iBAR <- table(unlist(lapply(sce_iBAR$gRNA,function(x) strsplit(x,"-")[[1]])))
mean(nr_gRNAs_sce_iBAR)
## [1] 44.54202
Average number of iBAR-groups per gRNA (using also cells with multiple gRNAs): 44.5420168.
First, we identify all those gRNAs associated with drug resistance, drug addiction or driver mutations together with potential high impact gRNAs targeting BCL2 and EGFR that are the only gRNA for at least 10 iBAR-groups. Now we perform DE analysis for those gRNAs.
gRNAs <- names(table(sce_iBAR$gRNA[!(grepl("-",sce_iBAR$gRNA))]))[table(sce_iBAR$gRNA[!(grepl("-",sce_iBAR$gRNA))])>=10]
gRNAs_NT <- gRNAs[grepl("non_targeting",gRNAs)]
gRNAs <- setdiff(gRNAs,gRNAs_NT)
gRNAs_resistance_driver <- intersect(gRNAs,validation_lib$ID[validation_lib$variant_class!="control"])
gRNAs_test <-intersect(gRNAs,c(validation_lib$ID[validation_lib$variant_class!="control"],c("BCL2_1e8877", "EGFR_156dcf", "EGFR_53b63e")))
More precisely, the following is DE analysis between iBAR-groups with unique non-targeting gRNAs and iBAR groups containing gRNAs from the set of gRNAs identified above.
markers <- list()
sce_tempA <- sce_iBAR
sce_tempA$gRNA[sce_tempA$gRNA %in% gRNAs_NT] <- "nontargeting"
for (j in 1:length(gRNAs_test)){
sce_temp1 <- sce_tempA[,grepl(gRNAs_test[j],sce_tempA$gRNA)]
sce_temp1$gRNA <- gRNAs_test[j]
sce_temp <- cbind(sce_tempA[,sce_tempA$gRNA=="nontargeting"],sce_temp1)
temp <- findMarkers(sce_temp,groups=sce_temp$gRNA,test.type="wilcox")
temp <- temp[[gRNAs_test[j]]]
temp$gRNA <- gRNAs_test[j]
temp$target_gene <- strsplit(gRNAs_test[j],"_")[[1]][1]
temp$DE_gene <- rownames(temp)
temp$DE_gene_name <- rowData(sce)$V2[match(temp$DE_gene,rowData(sce)$V1)]
markers[[j]] <- temp[order(temp$DE_gene),]
}
# correct for multiple testing
FDRs <- lapply(markers,function(x) x$FDR)
FDRs <- do.call(cbind,FDRs)
FDRs <- apply(FDRs,1,p.adjust)
for (j in 1:length(markers)){
markers[[j]]$FDR <- FDRs[j,]
}
markers <- do.call(rbind,markers)
write.table(markers,file=markers_file_all,sep=",",col.names = TRUE,row.names=FALSE)
write.table(markers[markers$FDR<0.1,],file=markers_file,sep=",",col.names = TRUE,row.names=FALSE)
For this section we assume that markers have already been computed for both the ABE and CBE data sets, using the code chunk above. We compute energy distances (Replogle, 2022. Cell), between targeting and non-targeting gRNAs, using iBAR-groups instead of cells. We use a basis of genes consisting of marker genes from both the CBE and ABE data sets (the same for both of the data sets), and normalise the energy distances by dividing them by the median of the energy distances found across different non-targeting gRNAs for the respective data set.
markers_all_ABE <- read.table("markersABE_all.csv",sep=",",header=TRUE)
markers_all_CBE <- read.table("markersCBE_all.csv",sep=",",header=TRUE)
markers_ABE_and_CBE <- rbind(markers_all_ABE,markers_all_CBE)
markers_AUC_ABE_and_CBE <- markers_ABE_and_CBE[abs(markers_ABE_and_CBE$summary.AUC-0.5)>0.2 & markers_ABE_and_CBE$FDR<0.1,]
basis_of_genes_ABE_and_CBE <- unique(markers_AUC_ABE_and_CBE$DE_gene_name)
reducedDims(sce_iBAR)$PCA_basis_ABE_and_CBE <- calculatePCA(sce_iBAR[rowData(sce_iBAR)$V2%in%basis_of_genes_ABE_and_CBE,],ncomponents=20,scale=TRUE)
ed <- rep(NA,length(gRNAs))
names(ed) <- gRNAs
sce_NT <- sce_iBAR[,sce_iBAR$gRNA%in%gRNAs_NT]
for (j in 1:length(gRNAs)){
pc_temp_1 <- reducedDims(sce_iBAR)$PCA_basis_ABE_and_CBE[grepl(gRNAs[j],sce_iBAR$gRNA),]
if (nrow(pc_temp_1) > 100){
pc_temp_1 <- pc_temp_1[sample(1:nrow(pc_temp_1),100),]
}
temp_vec <- c()
for (k in 1:length(unique(sce_NT$gRNA))){
pc_WT <- reducedDims(sce_NT)$PCA_basis_ABE_and_CBE[sce_NT$gRNA==unique(sce_NT$gRNA)[k],]
if (nrow(pc_WT) > 100){
pc_WT <- pc_WT[sample(1:nrow(pc_WT),100),]
}
temp_vec <- c(temp_vec,edist(rbind(pc_WT,pc_temp_1),c(nrow(pc_WT),nrow(pc_temp_1))))
}
ed[j] <- median(temp_vec)
}
ed_NT <- rep(NA,length(gRNAs_NT))
names(ed_NT) <- gRNAs_NT
for (j in 1:length(gRNAs_NT)){
sce_NT_temp <- sce_NT[,!(grepl(gRNAs_NT[j],sce_NT$gRNA))]
pc_temp_1 <- reducedDims(sce_NT)$PCA_basis_ABE_and_CBE[grepl(gRNAs_NT[j],sce_NT$gRNA),]
if (nrow(pc_temp_1) > 100){
pc_temp_1 <- pc_temp_1[sample(1:nrow(pc_temp_1),100),]
}
temp_vec <- c()
for (k in 1:length(unique(sce_NT_temp$gRNA))){
pc_WT <- reducedDims(sce_NT_temp)$PCA_basis_ABE_and_CBE[sce_NT_temp$gRNA==unique(sce_NT_temp$gRNA)[k],]
if (nrow(pc_WT) > 100){
pc_WT <- pc_WT[sample(1:nrow(pc_WT),100),]
}
temp_vec <- c(temp_vec,edist(rbind(pc_WT,pc_temp_1),c(nrow(pc_WT),nrow(pc_temp_1))))
}
ed_NT[j] <- median(temp_vec)
}
ed <- ed/median(ed_NT)
saveRDS(ed,file=file_energy_distance)
We add the energy distances to the gRNA meta-data.
ed <- readRDS(file_energy_distance)
validation_lib$ed <- ed[validation_lib$ID]
The following is a plot of the distribution of energy distance coloured by variant class.
ggplot(validation_lib,mapping=aes(x=log2(ed),color=variant_class)) + geom_density(size=1.5) + theme_classic(base_size=20) + labs(color="variant class")+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))+scale_color_manual(values=colourscheme_variant_class)+xlab("log2(normalised ed)")
The following plot is coloured by whether the gRNA confers resistance or not (binary). Resistance includes drivers and drug addiction.
ggplot(validation_lib,mapping=aes(x=log2(ed),color=resistance_mutation)) + geom_density(size=1.5) + theme_classic(base_size=20) + labs(color="resistance mutation")+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))+scale_color_manual(values=c("FALSE"="grey","TRUE"="purple"),labels=c("no resistance","resistance"))+xlab("log2(normalised ed)")
The following heatmap includes all targeting gRNAs that are the uniuqe gRNA for at least 10 iBAR groups and all genes that are differentially expressed compared to non-targeting control for at least one gRNA with FDR<0.1 and AUC<0.3 or AUC>0.7, where AUC refers to the area under the curve and significance was assessed by the Wilcoxon rank-sum test.
markers <- read.table(file=markers_file_all,sep=",",header=TRUE)
top_marker_genes <- unique(markers$DE_gene_name[markers$FDR<=0.1&abs(markers$summary.AUC-0.5)>0.2])
logcounts_top_markers <- logcounts(sce_iBAR[match(top_marker_genes,rowData(sce_iBAR)$V2),])
# averaging gene expression on per-gRNA basis
av_logcount_per_gRNA <- sapply(1:length(gRNAs),function(x) return(rowMeans(logcounts_top_markers[,grepl(gRNAs[x],sce_iBAR$gRNA)])))
colnames(av_logcount_per_gRNA) <- gRNAs
rownames(av_logcount_per_gRNA) <- top_marker_genes
av_logcount_per_gRNA_scaled <- t(apply(av_logcount_per_gRNA,1,function(x) (x-mean(x))/sd(x)))
df_logcounts <- data.frame(variant_class=validation_lib$variant_class[match(colnames(av_logcount_per_gRNA),validation_lib$ID)])
rownames(df_logcounts) <- colnames(av_logcount_per_gRNA)
cols_logcounts <- list(variant_class=colourscheme_variant_class)
library(viridis)
callback = function(hc, mat){
sv = svd(t(mat))$v[,1]
dend = reorder(as.dendrogram(hc), wts = sv)
as.hclust(dend)
}
library(dendsort)
callback = function(hc, ...){dendsort(hc)}
heatmap_markers <- pheatmap(av_logcount_per_gRNA_scaled,annotation_col = df_logcounts,annotation_colors = cols_logcounts,show_rownames = FALSE,
treeheight_row = 0,show_colnames = FALSE,color=colorRampPalette(c("Darkblue", "white","red"))(20),cutree_rows=2,cutree_cols = 4,clustering_callback = callback,cellheight = 0.2)
We summarise the expression within the gene modules in the heatmap and replot.
heatmap_condensed <- pheatmap(av_logcount_per_gRNA_scaled,annotation_col = df_logcounts,annotation_colors = cols_logcounts,show_rownames = FALSE,
treeheight_row = 0,show_colnames = FALSE,color=colorRampPalette(c("Darkblue", "white","red"))(20),cutree_rows=2,cutree_cols = 4,clustering_callback = callback,cluster_rows=TRUE,kmeans_k=2,cellheight = 20,treeheight_col = 100)
We obtain for each gRNA its cluster from the dendrogram in the above heatmaps and add it to the gRNA meta-data.
gRNA_clusters <- cutree(heatmap_markers$tree_col,4)
validation_lib$cluster <- NA
validation_lib$cluster[match(names(gRNA_clusters),validation_lib$ID)] <- gRNA_clusters
Finally, we replot the heatmap with the same order of gRNAs, only plotting the genes whose expression is most rank-correlated with energy distance, and restricting to gRNAs found to be resistant in the pooled screens, adding cluster and energy distance to the row annotations.
av_logcount_per_gRNA_scaled_ordered <- av_logcount_per_gRNA_scaled[,heatmap_markers$tree_col$order]
av_logcount_per_gRNA_scaled_ordered <- av_logcount_per_gRNA_scaled_ordered[,colnames(av_logcount_per_gRNA_scaled_ordered)%in%validation_lib$ID[validation_lib$variant_class%in%c("driver","drug addiction","canonical drug resistance")]]
#
# adjR_squared <- apply(av_logcount_per_gRNA_scaled,1,function(x) summary(lm(x~as.factor(gRNA_clusters)))$adj.r.squared)
# av_logcount_per_gRNA_scaled_ordered_sub <- av_logcount_per_gRNA_scaled_ordered[order(adjR_squared,decreasing=TRUE)[1:40],]
#most_DE_genes <- unique(markers$DE_gene_name[markers$FDR<10^-3&(markers$summary.AUC<0.2|markers$summary.AUC>0.8)&markers$Top<=20])
#finding the genes most correlated with energy distance
cor_with_ed <- cor(t(av_logcount_per_gRNA_scaled_ordered),ed[colnames(av_logcount_per_gRNA_scaled_ordered)],method="spearman")
av_logcount_per_gRNA_scaled_ordered_sub <- av_logcount_per_gRNA_scaled_ordered[order(abs(cor_with_ed),decreasing=TRUE)[1:20],]
df_logcounts_sub <- df_logcounts
cols_logcounts_sub <- cols_logcounts
cols_logcounts_sub$cluster <- c("4"="purple","2"="blue","3"="black","1"="lightgrey")
df_logcounts_sub$cluster <- as.factor(cutree(heatmap_markers$tree_col,4)[rownames(df_logcounts_sub)])
df_logcounts_sub$log2_ed <- log2(ed[rownames(df_logcounts_sub)])
df_logcounts_sub <- df_logcounts_sub[colnames(av_logcount_per_gRNA_scaled_ordered_sub),]
rowgaps <- c(sum(df_logcounts_sub$cluster==1),sum(df_logcounts_sub$cluster%in%c(1,2)),
sum(df_logcounts_sub$cluster%in%c(1,2,3)), sum(df_logcounts_sub$cluster!=4))
pheatmap(t(av_logcount_per_gRNA_scaled_ordered_sub),annotation_row = df_logcounts_sub,annotation_colors = cols_logcounts_sub,treeheight_row = 0,
show_rownames=TRUE,color=colorRampPalette(c("Darkblue", "white","red"))(20),
cluster_rows=FALSE,cutree_cols=2,treeheight_col=0,clustering_callback = callback,gaps_row = rowgaps)
We perform PCA for further downstream analysis and visualisation (energy distance, UMAP, diffusion score).
We use the genes identified above as differentially expressed between a gRNA and non-targeting controls as a basis of genes on which to perform principal component analysis (PCA). Genes are included if for at least one gRNA they have an FDR<0.1 and an AUC<0.3 or >0.7.
markers <- read.table(markers_file_all,sep=",",header=TRUE)
markers_AUC <- markers[abs(markers$summary.AUC-0.5)>0.2 & markers$FDR<0.1,]
basis_of_genes <- unique(markers_AUC$DE_gene_name)
reducedDims(sce_iBAR)$PCA_basis <- calculatePCA(sce_iBAR[rowData(sce_iBAR)$V2%in%basis_of_genes,],ncomponents=20,scale=TRUE)
sce_iBAR <- runUMAP(sce_iBAR,dimred="PCA_basis")
#saveRDS(sce_iBAR,file=sce_save_file)
We now plot the UMAP representation of the data. Each iBAR-group is a dot. For iBAR-groups with several gRNAs, variant classes are ranked by their impact as follows: driver/drug addiction > canonical drug resistance > control. iBAR groups with both driver and drug addiction conferring gRNAs are discarded. Colours indicate the class with the highest impact for each cell.
sce_iBAR <- sce_iBAR[,!(grepl("driver",sce_iBAR$variant_class)&grepl("drug addiction",sce_iBAR$variant_class))]
sce_iBAR$variant_class_highest_impact <- "control"
sce_iBAR$variant_class_highest_impact[grepl("canonical drug resistance",sce_iBAR$variant_class)] <- "canonical drug resistance"
sce_iBAR$variant_class_highest_impact[grepl("drug addiction",sce_iBAR$variant_class)] <- "drug addiction"
sce_iBAR$variant_class_highest_impact[grepl("driver",sce_iBAR$variant_class)] <- "driver"
plotReducedDim(cbind(sce_iBAR[,sce_iBAR$variant_class_highest_impact=="control"],sce_iBAR[,sce_iBAR$variant_class_highest_impact!="control"]),colour_by="variant_class_highest_impact",dimred="UMAP") +
scale_colour_manual(values=colourscheme_variant_class)+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))
We plot the correlation of differential expression from non-targeting controls (measured by the AUC) across all classes of resistance targets.
cor_effect <- matrix(0,nrow=length(gRNAs_resistance_driver),ncol=length(gRNAs_resistance_driver))
rownames(cor_effect) <- gRNAs_resistance_driver
colnames(cor_effect) <- gRNAs_resistance_driver
for (j in 2:length(gRNAs_resistance_driver)){
for (k in 1:(j-1)){
cor_effect[j,k] <- cor(markers$summary.AUC[markers$gRNA== gRNAs_resistance_driver[k]],
markers$summary.AUC[markers$gRNA== gRNAs_resistance_driver[j]])
}
}
cor_effect <- cor_effect+t(cor_effect)
for (j in 1:nrow(cor_effect)){
cor_effect[j,j] <- 1
}
df_row_cor <- data.frame(variant_class=validation_lib$variant_class[match(colnames(cor_effect),validation_lib$ID)],
gene=validation_lib$Gene[match(colnames(cor_effect),validation_lib$ID)])
rownames(df_row_cor) <- rownames(cor_effect)
col_genes <- brewer.pal(length(unique(df_row_cor$gene)),"Dark2")
names(col_genes) <- unique(df_row_cor$gene)
cols_cor <- list(variant_class=colourscheme_variant_class,gene=col_genes)
pheatmap(cor_effect,annotation_col = df_row_cor,annotation_colors = cols_cor)
We restrict the plot to drivers/drug addiction
gRNAs_DA <- intersect(gRNAs_resistance_driver,validation_lib$ID[validation_lib$variant_class!="canonical drug resistance"])
pheatmap(cor_effect[gRNAs_DA,gRNAs_DA],annotation_col = df_row_cor,annotation_colors = cols_cor)
We use DE expression data (log2-fold change) for patients with PFS > 6 months and patients with PFS < 6 months from the following publication.
Tian et al. (2023). Combined PD-1, BRAF and MEK inhibition in BRAFV600E colorectal cancer: a phase 2 trial. Nature Medicine.
First, we identify the 5000 genes most differently ranked for PFS<6m versus PFS>6m.
DEG_NR <- read.table("../../Tian_et_al_NR.csv",sep=",",header=TRUE)
DEG_R <- read.table("../../Tian_et_al_R.csv",sep=",",header=TRUE)
lm_R_vs_NR <- lm(DEG_R$avg_log2FC~DEG_NR$avg_log2FC)
genes_5000_most_diff <- DEG_NR$gene[order(abs(lm_R_vs_NR$residuals),decreasing =TRUE)[1:5000]]
We compute rank correlation between our differential expression results and Tian et al.’s DE results for patients with progression free survival (PFS) < 6 months, based on the 5000 genes identified above (PFS_outcome score).
cor_R <- rep(0,length(gRNAs_resistance_driver))
names(cor_R) <- gRNAs_resistance_driver
cor_NR <- rep(0,length(gRNAs_resistance_driver))
names(cor_NR) <- gRNAs_resistance_driver
for (j in 1:length(gRNAs_resistance_driver)){
markers_temp <- markers[markers$gRNA==gRNAs_resistance_driver[j],]
markers_temp <- markers_temp[match(DEG_R$gene,markers_temp$DE_gene_name),]
markers_temp <- markers_temp[match(genes_5000_most_diff,markers_temp$DE_gene_name),]
cor_R[j] <- cor(markers_temp$summary.AUC,DEG_R$avg_log2FC[match(genes_5000_most_diff,DEG_R$gene)],method="spearman")
cor_NR[j] <- cor(markers_temp$summary.AUC,DEG_NR$avg_log2FC[match(genes_5000_most_diff,DEG_NR$gene)],method="spearman")
}
df_cor_R <- data.frame(cor_R=cor_R,gRNA=names(cor_R),variant_type=validation_lib$variant_class[match(names(cor_R),validation_lib$ID)])
df_cor_R <- df_cor_R[order(cor_R),]
df_cor_R$gRNA <- factor(df_cor_R$gRNA,levels=df_cor_R$gRNA)
ggplot(df_cor_R,mapping=aes(y=cor_R,x=gRNA,fill=variant_type)) + geom_bar(stat="identity") + coord_flip()+scale_fill_manual(values=colourscheme_variant_class[names(colourscheme_variant_class)!="control"])+theme_classic()+ylab("PFS_outcome score")
We add the PFS_outcome score to the SingleCellExperiment, for iBAR clones with one resistance gRNA and no other gRNAs.
sce_iBAR$PFS_outcome_score <- NA
for (j in 1:length(cor_R)){
sce_iBAR$PFS_outcome_score[sce_iBAR$gRNA == names(cor_R)[j]] <- cor_R[j]
}
We compute diffusion scores as in Cooper, Coelho, Strauss et al. (2023) bioRxiv. (https://doi.org/10.1101/2023.05.22.541777) and Haghverdi et al. (2015). Bioinformatics, for those resistance conferring gRNAs that are the unique gRNA in at least 10 iBAR groups.
sce_iBAR_resistance <- sce_iBAR[,sce_iBAR$variant_class_highest_impact!="control"]
markers_AUC$variant_class <- validation_lib$variant_class[match(markers_AUC$gRNA,validation_lib$ID)]
markers_AUC_DA <- markers_AUC[markers_AUC$variant_class!="canonical drug resistance",]
nr_DE_genes <- table(markers_AUC_DA$gRNA)
sce_iBAR_resistance$nr_DE_genes <- 0
for (j in 1:length(nr_DE_genes)){
sce_iBAR_resistance$nr_DE_genes[grepl(names(nr_DE_genes)[j],sce_iBAR_resistance$gRNA)] <- sce_iBAR_resistance$nr_DE_genes[grepl(names(nr_DE_genes)[j],sce_iBAR_resistance$gRNA)] + nr_DE_genes[j]
}
dm <- DiffusionMap(reducedDims(sce_iBAR_resistance)$PCA_basis)
if (cor(dm$DC1,sce_iBAR_resistance$nr_DE_genes) < 0){
eigenvectors(dm) <- -eigenvectors(dm)
}
sce_iBAR_resistance$diffusion_score <- dm$DC1
xx <- sample(1:length(dm$DC1))
tmp <- data.frame(DC1 = eigenvectors(dm)[xx, 1],
DC2 = eigenvectors(dm)[xx, 2],
target_gene =sce_iBAR_resistance$target_gene[xx],
gRNA =sce_iBAR_resistance$gRNA[xx],
nr_DE_genes=sce_iBAR_resistance$nr_DE_genes[xx],
dpt=dm$DC1[xx])
The following is a diffusion map coloured by the diffusion score.
p1c <- ggplot(tmp, aes(x = DC1, y = DC2, colour = dpt)) +
geom_point() +
xlab("Diffusion component 1") +
ylab("Diffusion component 2") +
theme_classic(base_size=11) + theme(legend.position = "bottom",legend.box="vertical",legend.margin=margin())+
labs(color="")+scale_color_viridis_c(option="magma")+ggtitle("diffusion score")
print(p1c)
The following plot shows the distribution of diffusion scores for each driver/drug addiction gRNA, ordered horizontally by their mean diffusion scores. For several of the gRNAs the diffusion scores have a multi-modal distribution, indicating either differences in editing between iBAR-groups of the same gRNA, or different responses to the same edits.
sce_iBAR_resistance_one_gRNA <- sce_iBAR_resistance[,!(grepl("-",sce_iBAR_resistance$gRNA))]
df_gRNA_ds_mean <- unlist(lapply(1:length(gRNAs_resistance_driver),function(x) mean(sce_iBAR_resistance_one_gRNA$diffusion_score[sce_iBAR_resistance_one_gRNA$gRNA==gRNAs_resistance_driver[x]])))
names(df_gRNA_ds_mean ) <- gRNAs_resistance_driver
levels_a <- names(df_gRNA_ds_mean)[order(df_gRNA_ds_mean,decreasing=TRUE)]
df <- data.frame(diffusion_score=sce_iBAR_resistance_one_gRNA$diffusion_score,gRNA=sce_iBAR_resistance_one_gRNA$gRNA)
split_gRNAs <- function(line_df){
output <- line_df
if (sum(grepl("-",line_df$gRNA))>=1)
{
w <- strsplit(line_df$gRNA,"-")[[1]]
for (k in 1:length(w)){
output <- rbind(output,unlist(c(line_df[1],w[k])))
}
}
return(output)
}
split_gRNAs_df <- function(d_f){
output <- c()
for (j in 1:nrow(d_f)){
output <- rbind(output,split_gRNAs(d_f[j,])) }
output <- output[!(grepl("-",output$gRNA)),]
}
df <- split_gRNAs_df(df)
df <- df[df$gRNA%in%gRNAs_resistance_driver,]
df$diffusion_score <- as.double(df$diffusion_score)
df$mean_diffusion_score <- df_gRNA_ds_mean[as.vector(df$gRNA)]
df$variant_class <- validation_lib$variant_class[match(as.vector(df$gRNA),validation_lib$ID)]
df$gRNA=factor(df$gRNA,levels=levels_a)
y_lab_colours <- colourscheme_variant_class[validation_lib$variant_class[match(levels_a,validation_lib$ID)]]
p2 <- ggplot(as.data.frame(df),aes(x=diffusion_score,y=gRNA,fill=mean_diffusion_score)) + geom_density_ridges() + scale_fill_viridis_c(option="magma")+
theme_classic()+theme(axis.text.y = element_text(color=y_lab_colours))+ylab("")
print(p2)
Now we re-colour the plot above by PFS_outcome score, keeping the horizontal ordering of the gRNAs as above.
df_unique_gRNA <- df[!(grepl("-",df$gRNA)),]
df_unique_gRNA$PFS_outcome_score <- cor_R[as.vector(df_unique_gRNA$gRNA)]
p3 <- ggplot(as.data.frame(df_unique_gRNA),aes(x=diffusion_score,y=gRNA,fill=PFS_outcome_score)) + geom_density_ridges() + scale_fill_viridis_c()+
theme_classic()+theme(axis.text.y = element_text(color=y_lab_colours))+ylab("")
print(p3)
The following scatterplot compares average diffusion scores for each gRNA to survival scores, illustrating the high negative correlation between the two scores.
library(ggrepel)
df_scatter_diffusion_PFS_outcome <- data.frame(diffusion_score=df_gRNA_ds_mean,PFS_outcome_score=cor_R[names(df_gRNA_ds_mean)],
gRNA=names(df_gRNA_ds_mean),gene=sapply(names(df_gRNA_ds_mean),function(x) strsplit(x,"_")[[1]][1]))
ggplot() + geom_point(data=df_scatter_diffusion_PFS_outcome,mapping=aes(x=diffusion_score,y=PFS_outcome_score,label=gRNA,color=gene))+ geom_smooth(data=df_scatter_diffusion_PFS_outcome,mapping=aes(x=diffusion_score,y=PFS_outcome_score),alpha=0.2)+theme_classic()+geom_text_repel(data=df_scatter_diffusion_PFS_outcome,mapping=aes(x=diffusion_score,y=PFS_outcome_score,label=gRNA,color=gene))+theme(legend.position = "None")
Correlation between diffusion and PFS_outcome score: -0.8910845.
First we compute PROGENy scores.
(M Schubert et al. 2018. Perturbation-Response Genes Reveal Signaling Footprints in Cancer Gene Expression. Nature Communications.
Holland CH et al. 2020. Robustness and applicability of transcription factor and pathway analysis tools on single-cell RNA-seq data. Genome Biology. )
logcounts_sce_iBAR <- logcounts(sce_iBAR)
rownames(logcounts_sce_iBAR) <- rowData(sce_iBAR)$V2
progeny_scores <- progeny(as.matrix(logcounts_sce_iBAR),scale=FALSE)
We transform the scores linearly such that their mean across the nontargeting control gRNAs is 0 and the standard deviation across the same barcodes is equal to 1
progeny_scores_1gRNA <- progeny_scores[sapply(rownames(progeny_scores), function(x) str_count(x,"-") == 1),]
progeny_scores_nontargeting <- progeny_scores_1gRNA[grepl("non_",rownames(progeny_scores_1gRNA)),]
mean_nontargeting <- colMeans(progeny_scores_nontargeting)
sd_nontargeting <- apply(progeny_scores_nontargeting,2,sd)
progeny_scores <- t(apply(progeny_scores,1,function(v) (v-mean_nontargeting)/sd_nontargeting))
We plot the distribution of PROGENy scores split by variant class.
colData(sce_iBAR) <- cbind(colData(sce_iBAR),progeny_scores)
for (j in 1:ncol(progeny_scores)){
df_temp <- data.frame(score=progeny_scores[,j],variant_class=sce_iBAR$variant_class_highest_impact)
if (yy=="ABE"){
df_temp <- df_temp[df_temp$variant_class!="driver",]
} #as very few driver iBARs in the data
print(ggplot(df_temp,aes(x=score,color=variant_class)) + geom_density(size=1) + theme_classic(base_size=12)+ scale_color_colorblind()+ggtitle(colnames(progeny_scores)[j])+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=2)))+scale_color_manual(values=colourscheme_variant_class[unique(df_temp$variant_class)]))
}
gRNA_pathway_mean <- do.call(rbind,lapply(1:length(gRNAs),function(x) colMeans(progeny_scores[grepl(gRNAs[x],rownames(progeny_scores)),,drop=FALSE])))
rownames(gRNA_pathway_mean) <- gRNAs
The following heatmap illustrates mean PROGENy scores for all targeting gRNAs.
df_row_pathway <- data.frame(variant_class=validation_lib$variant_class[match(gRNAs,validation_lib$ID)])
rownames(df_row_pathway) <- gRNAs
colors_pathway_heatmap <- list(variant_class=colourscheme_variant_class[unique(df_row_pathway$variant_class)])
max_val <- max(gRNA_pathway_mean)
min_val <- min(gRNA_pathway_mean)
nr_colors <- 30
col_pathway = colorRampPalette(c("Darkblue", "white","red"))(nr_colors)
breaks_pathway = c(seq(min_val, 0,
length.out=ceiling(nr_colors/2) + 1),
seq(max_val/nr_colors,
max_val, length.out=floor(nr_colors/2)))
pheatmap(t(gRNA_pathway_mean),annotation_col=df_row_pathway,annotation_colors = colors_pathway_heatmap ,col=col_pathway,breaks=breaks_pathway,show_colnames = FALSE,treeheight_col = 100,cutree_cols=3,cellheight = 10,treeheight_row = 0)
Now we zoom in on the gRNAs that confer resistance or drug addiction
pheatmap(t(gRNA_pathway_mean[gRNAs_resistance_driver,]),annotation_col=df_row_pathway,annotation_colors = colors_pathway_heatmap ,col=col_pathway,breaks=breaks_pathway,show_colnames = TRUE,cutree_cols=3,cellheight = 10,treeheight_row = 0)
First, we compare each driver, drug addiction and resistance related gRNA to non-targeting controls.
sce_iBAR_nontargeting <- sce_iBAR[,sce_iBAR$target_gene == "non_targeting"]
iBARs_nontargeting <- colnames(sce_iBAR_nontargeting)
markers_progeny <- list()
target_resistance_driver <- unique(sapply(gRNAs_resistance_driver,function(x) strsplit(x,"_")[[1]][1]))
for (j in 1:length(gRNAs_resistance_driver)){
iBARs_temp <- colnames(sce_iBAR)[grepl(gRNAs_resistance_driver[j],sce_iBAR$gRNA)]
scores_temp <- t(progeny_scores[c(iBARs_temp,iBARs_nontargeting),])
group_temp <- c(rep(gRNAs_resistance_driver[j],length(iBARs_temp)),
rep("nontargeting control",length(iBARs_nontargeting)))
markers_progeny[[j]] <- findMarkers(scores_temp,groups=group_temp,test.type="wilcox")[[gRNAs_resistance_driver[j]]]
markers_progeny[[j]]$gRNA <- gRNAs_resistance_driver[j]
markers_progeny[[j]]$pathway <- rownames(markers_progeny[[j]])
markers_progeny[[j]] <- markers_progeny[[j]][order(markers_progeny[[j]]$pathway),]
}
# correct for multiple testing
FDRs <- lapply(markers_progeny,function(x) x$FDR)
FDRs <- do.call(cbind,FDRs)
FDRs <- apply(FDRs,1,p.adjust)
for (j in 1:length(markers)){
markers_progeny[[j]]$FDR <- FDRs[j,]
}
markers_progeny <- do.call(rbind,markers_progeny)
markers_progeny <- markers_progeny[order(markers_progeny$pathway),]
write.table(markers_progeny,file=markers_file_progeny,sep=",",col.names = TRUE,row.names=FALSE)
We summarise the above analysis in a heatmap plot. Significant differential expression is marked by a dot.
df <- as.data.frame(markers_progeny)
df$sig <- "not_significant"
df$sig[df$FDR<0.1] <- "significant"
pp <- ggplot(df,mapping = aes(y = pathway, x = gRNA,size=sig)) +
geom_raster(aes(fill=summary.AUC)) +
labs(y="", x="") +
theme_bw(base_size=16) + theme(axis.text.x=element_text(size=14, angle=90, vjust=0.3),
axis.text.y=element_text(size=14),
plot.title=element_text(size=15)) + labs(fill="AUC")+
scale_fill_gradient2(midpoint=0.5,limits=c(0,1),
oob = scales::squish,low="darkblue",high="orange") + geom_point() +
scale_size_manual(values=c(significant=2, not_significant=NA),guide="none")
print(pp+ggtitle("DE for PROGENy scores compared to nontargeting\ncontrol"))
We plot up to 20 upregulated and 20 downregulated genes (the most significant ones), using the markers computed previously.
for (j in 1:length(gRNAs_DA)){
markers_temp <- markers[markers$gRNA==gRNAs_DA[j],]
sig <- rep("not significant",nrow(markers_temp))
sig[markers_temp$FDR<0.1&markers_temp$AUC<0.5] <- "down"
sig[markers_temp$FDR<0.1&markers_temp$AUC>0.5] <- "up"
df_temp <- data.frame(AUC=markers_temp$AUC.nontargeting,FDR=markers_temp$FDR,gene_name=markers_temp$DE_gene_name,
sig=sig)
df_highlight_up <- df_temp[df_temp$FDR<0.1 & df_temp$AUC>0.5,]
df_highlight_up <- df_highlight_up[order(df_highlight_up$FDR)[1:(min(20,nrow(df_highlight_up)))],]
df_highlight_down <- df_temp[df_temp$FDR<0.1 & df_temp$AUC<0.5,]
df_highlight_down <- df_highlight_down[order(df_highlight_down$FDR)[1:(min(20,nrow(df_highlight_down)))],]
pp <- ggplot(df_temp, aes(x=AUC, y=-log10(FDR),color=sig)) +
geom_point() +
ggtitle(gRNAs_DA[j]) +
labs(y=expression('-Log'[10]*' FDR'), x=expression('AUC')) +
theme_classic(base_size=20) +
theme(legend.position="none", plot.title = element_text(size = rel(1), hjust = 0.5))+
geom_text_repel(data=rbind(df_highlight_up,df_highlight_down),aes(x = AUC, y = -log10(FDR),label=gene_name),max.overlaps=100)+
scale_color_manual(values=c("not significant"='lightgray',"down"='red',"up"='blue'))+ scale_y_continuous(trans=scales::pseudo_log_trans(sigma=1,base = 1.05))+
geom_vline(xintercept = 0.5)+geom_hline(yintercept = 1)
print(pp)
}
Now we perform differential expression across the variant types. To give equal weight to each gRNA, we subsample the same number of iBAR groups for each gRNA for any gRNA with more than 20 iBAR groups.
sce_canonical_drug_resistance <- sce_iBAR[,sce_iBAR$variant_class_highest_impact=="canonical drug resistance"]
gRNAs_canonical_drug_resistance <- validation_lib$ID[validation_lib$variant_class=="canonical drug resistance"]
sce_canonical_drug_resistance_balanced <- sce_iBAR[,sce_iBAR$gRNA==""]
for (j in 1:length(gRNAs_canonical_drug_resistance)){
xx_temp <- colnames(sce_iBAR)[grepl(gRNAs_canonical_drug_resistance[j],sce_iBAR$gRNA)]
if (length(xx_temp)>=20){
xx_temp <- sample(xx_temp,20)}
sce_canonical_drug_resistance_balanced <- cbind(sce_canonical_drug_resistance_balanced ,sce_iBAR[,xx_temp])
}
aa <- ifelse(yy=="ABE","drug addiction","driver")
sce_other_impact <- sce_iBAR[,sce_iBAR$variant_class_highest_impact==aa]
gRNAs_other_impact <- validation_lib$ID[validation_lib$variant_class==aa]
sce_other_impact_balanced <- sce_iBAR[,sce_iBAR$gRNA==""]
for (j in 1:length(gRNAs_other_impact)){
xx_temp <- colnames(sce_iBAR)[grepl(gRNAs_other_impact[j],sce_iBAR$gRNA)]
if (length(xx_temp)>=20){
xx_temp <- sample(xx_temp,20)}
sce_other_impact_balanced <- cbind(sce_other_impact_balanced ,sce_iBAR[,xx_temp])
}
markers_other_impact_versus_canonical <- findMarkers(cbind(sce_canonical_drug_resistance_balanced,sce_other_impact_balanced),
groups=c(rep("canonical",ncol(sce_canonical_drug_resistance_balanced)),
rep(aa,ncol(sce_other_impact_balanced))),test.type="wilcox")[[aa]]
markers_other_impact_versus_canonical$gene <- rowData(sce_iBAR)[match(rownames(markers_other_impact_versus_canonical),rownames(sce_iBAR)),2]
sig <- rep("not significant",nrow(markers_other_impact_versus_canonical))
sig[markers_other_impact_versus_canonical$FDR<0.1&markers_other_impact_versus_canonical$summary.AUC<0.5] <- "down"
sig[markers_other_impact_versus_canonical$FDR<0.1&markers_other_impact_versus_canonical$summary.AUC>0.5] <- "up"
df_temp <- data.frame(AUC=markers_other_impact_versus_canonical$summary.AUC,FDR=markers_other_impact_versus_canonical$FDR,
gene_name=markers_other_impact_versus_canonical$gene,sig=sig)
df_highlight_up <- df_temp[df_temp$FDR<0.1 & df_temp$AUC>0.5,]
df_highlight_up <- df_highlight_up[order(df_highlight_up$FDR)[1:(min(20,nrow(df_highlight_up)))],]
df_highlight_down <- df_temp[df_temp$FDR<0.1 & df_temp$AUC<0.5,]
df_highlight_down <- df_highlight_down[order(df_highlight_down$FDR)[1:(min(20,nrow(df_highlight_down)))],]
pp <- ggplot(df_temp, aes(x=AUC, y=-log10(FDR),color=sig)) +
geom_point() +
ggtitle(paste0(aa," versus canonical resistance")) +
labs(y=expression('-Log'[10]*' FDR'), x=expression('AUC')) +
theme_classic(base_size=20) +
theme(legend.position="none", plot.title = element_text(size = rel(1), hjust = 0.5))+
geom_text_repel(data=rbind(df_highlight_up,df_highlight_down),aes(x = AUC, y = -log10(FDR),label=gene_name),max.overlaps=100)+
scale_color_manual(values=c("not significant"='lightgray',"down"='red',"up"='blue'))+ scale_y_continuous(trans=scales::pseudo_log_trans(sigma=1,base = 1.05))+
geom_vline(xintercept = 0.5)+geom_hline(yintercept = 1)
print(pp)
We compare PROGENy scores across the variant types. To give equal weight to each gRNA, we subsample the same number of iBAR groups for each gRNA, for any gRNA with more than 20 iBAR groups.
progeny_scores_sub <- progeny_scores[c(colnames(sce_canonical_drug_resistance_balanced)
,colnames(sce_other_impact_balanced)),]
markers_progeny_other_impact_versus_canonical <- findMarkers(t(progeny_scores_sub),groups=c(rep("canonical",ncol(sce_canonical_drug_resistance_balanced)),
rep(aa,ncol(sce_other_impact_balanced))),test.type="wilcox")[[aa]]
sig <- rep("not significant",nrow(markers_progeny_other_impact_versus_canonical))
sig[markers_progeny_other_impact_versus_canonical$FDR<0.1&markers_progeny_other_impact_versus_canonical$summary.AUC<0.5] <- "down"
sig[markers_progeny_other_impact_versus_canonical$FDR<0.1&markers_progeny_other_impact_versus_canonical$summary.AUC>0.5] <- "up"
df_temp <- data.frame(AUC=markers_progeny_other_impact_versus_canonical$summary.AUC,FDR=markers_progeny_other_impact_versus_canonical$FDR,
gene_name=rownames(markers_progeny_other_impact_versus_canonical),sig=sig)
df_highlight_up <- df_temp[df_temp$FDR<0.1 & df_temp$AUC>0.5,]
df_highlight_up <- df_highlight_up[order(df_highlight_up$FDR)[1:(min(20,nrow(df_highlight_up)))],]
df_highlight_down <- df_temp[df_temp$FDR<0.1 & df_temp$AUC<0.5,]
df_highlight_down <- df_highlight_down[order(df_highlight_down$FDR)[1:(min(20,nrow(df_highlight_down)))],]
pp <- ggplot(df_temp, aes(x=AUC, y=-log10(FDR),color=sig)) +
geom_point() +
ggtitle(paste0(aa," versus canonical resistance\nPROGENy")) +
labs(y=expression('-Log'[10]*' FDR'), x=expression('AUC')) +
theme_classic(base_size=20) +
theme(legend.position="none", plot.title = element_text(size = rel(1), hjust = 0.5))+
geom_text_repel(data=rbind(df_highlight_up,df_highlight_down),aes(x = AUC, y = -log10(FDR),label=gene_name),max.overlaps=100)+
scale_color_manual(values=c("not significant"='lightgray',"down"='red',"up"='blue'))+ scale_y_continuous(trans=scales::pseudo_log_trans(sigma=1,base = 1.05))+
geom_vline(xintercept = 0.5)+geom_hline(yintercept = 1)
print(pp)
We investigate whether there is a difference in proportion of cell cycle phases across variant types. First, we plot the relation between cell cycle and variant class.
sce_iBAR <- cell_cycle_scoring(sce_iBAR)
df_cc <- data.frame(variant_class=sce_iBAR$variant_class_highest_impact,phase=sce_iBAR$phase)
cc_count <- df_cc %>% dplyr::group_by_all() %>% dplyr::count()
if (yy=="ABE"){
cc_count <- cc_count[cc_count$variant_class!="driver",]#remove driver for ABE for lack of data
}
cc_count
## # A tibble: 9 × 3
## # Groups: variant_class, phase [9]
## variant_class phase n
## <chr> <chr> <int>
## 1 canonical drug resistance G1 144
## 2 canonical drug resistance G2M 71
## 3 canonical drug resistance S 56
## 4 control G1 4686
## 5 control G2M 2112
## 6 control S 685
## 7 driver G1 230
## 8 driver G2M 117
## 9 driver S 70
ggplot(df_cc,mapping=aes(x=variant_class,fill=phase)) + geom_bar(aes( y=..count../tapply(..count.., ..x.. ,sum)[..x..])) + scale_fill_manual(values=c("G1"="darkgrey","G2M"="black","S"="purple"))+theme_bw(base_size=12) + ylab("proportion in phase")
Now we plot the relation between cell cycle phase and resistance as a binary variable.
sce_iBAR$resistance_mutation <- grepl("TRUE",sce_iBAR$resistance_mutation) #at least one resistance conferring gRNA in the cell
df_cc <- data.frame(resistance=sce_iBAR$resistance_mutation ,phase=sce_iBAR$phase)
df_cc$resistance <- ifelse(df_cc$resistance,"resistance","no resistance")
ggplot(df_cc,mapping=aes(x=resistance,fill=phase)) + geom_bar(aes( y=..count../tapply(..count.., ..x.. ,sum)[..x..])) + scale_fill_manual(values=c("G1"="darkgrey","G2M"="black","S"="purple"))+theme_bw(base_size=12) + ylab("proportion in phase")
We test whether there is a significant change in the number of iBAR groups for each cell cycle phase between gRNAs associated with resistance, and those not associated with resistance.
df_cc_count <- df_cc %>% group_by_all() %>% count()
table_compare_cc <- reshape2::acast(df_cc_count,phase~resistance)
print(chisq.test(table_compare_cc))
##
## Pearson's Chi-squared test
##
## data: table_compare_cc
## X-squared = 63.696, df = 2, p-value = 1.474e-14
#saveRDS(sce_iBAR,file=sce_save_file)
First, we compute MAYA activity scores (Landais and Vallot (2023). Multi-modal quantification of pathway activity with MAYA. Nat Commun.). MAYA computes for each gene set a linear low dimensional representation of gene expression using principal component analysis. For each gene set and each principal component it computes a score for each cell in the data set. It keeps those scores that are bimodal across the data set.
library(MAYA)
counts_sce_iBAR <- counts(sce_iBAR)
rownames(counts_sce_iBAR) <- rowData(sce_iBAR)[,2]
activity_summary<-MAYA_pathway_analysis(expr_mat=counts_sce_iBAR,
modules_list = "hallmark",
is_logcpm=F)
activity_summary_ordered <- activity_summary$activity_matrix[,order(sce_iBAR$variant_class_highest_impact)]
anno_col = colData(sce_iBAR)[,c("variant_class_highest_impact"),drop=FALSE]
colnames(anno_col) <- "variant_class"
anno_col <- anno_col[colnames(activity_summary_ordered),,drop=FALSE]
anno_col <- as.data.frame(anno_col)
anno_colours <- list(variant_class=colourscheme_variant_class)
We identify modules that are differentially expressed for iBAR groups with resistance/addiction/driver phenotypes compared to non-targeting control gRNAs
markers_MAYA_hallmark <- list()
scores_nontargeting <- activity_summary_ordered[,iBARs_nontargeting]
for (j in 1:length(gRNAs_resistance_driver)){
iBARs_temp <- colnames(sce_iBAR)[grepl(gRNAs_resistance_driver[j],colnames(sce_iBAR))]
scores_temp <- cbind(activity_summary_ordered[,iBARs_temp],scores_nontargeting)
group_temp <- c(rep(gRNAs_resistance_driver[j],length(iBARs_temp)),
rep("nontargeting control",length(iBARs_nontargeting)))
markers_MAYA_hallmark[[j]] <- findMarkers(scores_temp,groups=group_temp,test.type="wilcox")[[gRNAs_resistance_driver[j]]]
markers_MAYA_hallmark[[j]]$gRNA <- gRNAs_resistance_driver[j]
markers_MAYA_hallmark[[j]]$pathway <- rownames(markers_MAYA_hallmark[[j]])
markers_MAYA_hallmark[[j]] <- markers_MAYA_hallmark[[j]][order(markers_MAYA_hallmark[[j]]$pathway),]
}
# correct for multiple testing
FDRs <- lapply(markers_MAYA_hallmark,function(x) x$FDR)
FDRs <- do.call(cbind,FDRs)
FDRs <- apply(FDRs,1,p.adjust)
for (j in 1:length(markers)){
markers_MAYA_hallmark[[j]]$FDR <- FDRs[j,]
}
markers_MAYA_hallmark <- do.call(rbind,markers_MAYA_hallmark)
markers_MAYA_hallmark <- markers_MAYA_hallmark[order(markers_MAYA_hallmark$pathway),]
write.table(markers_MAYA_hallmark,file=markers_file_MAYA_hallmark,sep=",",col.names = TRUE,row.names=FALSE)
We summarise the above analysis in a heatmap plot. Significant differential expression is marked by a dot.
df <- as.data.frame(markers_MAYA_hallmark)
df$sig <- "not_significant"
df$sig[df$FDR<0.1] <- "significant"
pp <- ggplot(df,mapping = aes(y = pathway, x = gRNA,size=sig)) +
geom_raster(aes(fill=summary.AUC)) +
labs(y="", x="") +
theme_bw() + theme(axis.text.x=element_text(size=9, angle=90, vjust=0.3),
axis.text.y=element_text(size=9),
plot.title=element_text(size=11)) + labs(fill="AUC")+
scale_fill_gradient2(midpoint=0.5,limits=c(0,1),
oob = scales::squish,low="darkblue",high="orange") + geom_point() +
scale_size_manual(values=c(significant=2, not_significant=NA),guide="none")
print(pp+ggtitle("DE for MAYA_hallmark scores compared to nontargeting control"))
For the pathways with significant alteration for any resistance conferring target, we plot the differential expression levels compared to nontargeting control for the 30 genes with the highest weight for the relevant PC identified by MAYA.
gRNAs_MAYA_hallmark.keep <- unique(markers_MAYA_hallmark$gRNA[markers_MAYA_hallmark$FDR<0.1])
hallmark_sets <- names(activity_summary$PCA_obj)
for (j in 1:length(hallmark_sets)){
gene_contributions <- activity_summary$PCA_obj[[j]]$gene_contrib
for (k in 1:nrow(gene_contributions)){
gene_contributions_2 <- gene_contributions[k,]
gene_contributions_1 <- gene_contributions_2[order(gene_contributions_2,decreasing=TRUE)[1:30]]
markers_temp <- markers[markers$FDR<0.1&markers$gRNA%in%gRNAs_MAYA_hallmark.keep&markers$DE_gene_name%in%names(gene_contributions_1),]
df <- as.data.frame(markers_temp)
pp <- ggplot(df,mapping = aes(y = DE_gene_name, x = gRNA)) +
geom_raster(aes(fill=summary.AUC)) +
labs(y="", x="") +
theme(axis.text.x=element_text(size=9, angle=90, vjust=0.3),
axis.text.y=element_text(size=9),
plot.title=element_text(size=11)) + labs(fill="AUC")+
scale_fill_gradient2(midpoint=0.5,limits=c(0,1),
oob = scales::squish) +theme_bw()+ggtitle(paste0(hallmark_sets[j],"-mode ",k))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(pp)
gene_contributions_1 <- gene_contributions_1[unique(df$DE_gene_name)]
gene_contributions_1 <- gene_contributions_1[order(names(gene_contributions_1))]
pp2 <- ggplot(mapping=aes(x=factor(names(gene_contributions_1),levels=names(gene_contributions_1)),y=gene_contributions_1,fill=gene_contributions_1>0)) + geom_bar(stat="identity")+theme_classic()+ggtitle(paste0(hallmark_sets[j],"-mode ",k))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+xlab("")+ylab("PC weight of gene")+
scale_fill_manual(values=c("TRUE"="blue","FALSE"="red"))+theme(legend.position = "none")
print(pp2)
}
}
We compare PFS outcome scores across the different variant classes.
gRNAs_AUC <- sce_iBAR$gRNA[!(grepl("-",sce_iBAR$gRNA))]
gRNAs_AUC <- names(table(gRNAs_AUC))[table(gRNAs_AUC)>=10]
AUC_all <- matrix(NA,ncol=length(gRNAs_AUC),nrow=nrow(sce_iBAR))
names(AUC_all) <- gRNAs_AUC
for (j in 1:length(gRNAs_AUC)){
sce_temp <- sce_iBAR[,sce_iBAR$gRNA%in%c(gRNAs_AUC[j],gRNAs_NT)]
sce_temp$gRNA[sce_temp$gRNA %in% setdiff(gRNAs_NT,gRNAs_AUC[j])] <- "nontargeting"
AUC_all[,j] <- findMarkers(sce_temp,groups=sce_temp$gRNA,test.type="wilcox")[[gRNAs_AUC[j]]][rownames(sce_temp),]$summary.AUC
}
cor_R_all <- rep(NA,length(gRNAs_AUC))
names(cor_R_all) <- gRNAs_AUC
for (j in 1:length(gRNAs_AUC)){
cor_R_all[j] <- cor(AUC_all[match(genes_5000_most_diff,rowData(sce_iBAR)$V2),j],DEG_R$avg_log2FC[match(genes_5000_most_diff,DEG_R$gene)],method="spearman")
}
df_cor_R_all <- data.frame(gRNA=names(cor_R_all),PFS_outcome_score=cor_R_all,variant_class=validation_lib$variant_class[match(names(cor_R_all),validation_lib$ID)])
df_cor_R_all$variant_class[grepl("non_targeting",df_cor_R_all$gRNA)] <- "non_targeting_control"
colourscheme_temp <- c(colourscheme_variant_class,"non_targeting_control"="black")
if (yy=="ABE"){
colourscheme_temp <- colourscheme_temp[setdiff(names(colourscheme_temp),"driver")]
df_cor_R_all <- df_cor_R_all[df_cor_R_all$variant_class!="driver",]
}else{
colourscheme_temp <-colourscheme_temp[setdiff(names(colourscheme_temp),"drug addiction")]
}
median_PFS_outcome_score_per_variant_class <- df_cor_R_all[,c("PFS_outcome_score","variant_class")] %>% group_by(variant_class)%>% summarize(median_PFS_outcome_score=median(PFS_outcome_score))
median_PFS_outcome_score_per_variant_class <- median_PFS_outcome_score_per_variant_class[order(median_PFS_outcome_score_per_variant_class$median_PFS_outcome_score),]
df_cor_R_all$variant_class <- factor(df_cor_R_all$variant_class,levels=median_PFS_outcome_score_per_variant_class$variant_class)
ggplot(df_cor_R_all,aes(y=PFS_outcome_score,x=variant_class,color=variant_class)) + geom_boxplot() + scale_color_manual(values=colourscheme_temp)+theme_classic(base_size=14)+xlab("")+theme(
axis.text.x = element_blank())
We test for significance of difference of PSF outcome scores across variant classes.
wilcox_PFS_outcome_score <- pairwise.wilcox.test(df_cor_R_all$PFS_outcome_score,g=df_cor_R_all$variant_class,p.adjust.method = "BH")
wilcox_PFS_outcome_score_p_val <- reshape2::melt(signif(wilcox_PFS_outcome_score$p.value,2))
names(wilcox_PFS_outcome_score_p_val) <- c("variant_class_1","variant_class_2","FDR")
if (yy=="ABE"){
wilcox_PFS_outcome_score_p_val$variant_class_1 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_1,levels=c("non_targeting_control","control","canonical drug resistance","drug addiction"))
wilcox_PFS_outcome_score_p_val$variant_class_2 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_2,levels=c("non_targeting_control","control","canonical drug resistance","drug addiction"))
}else{
wilcox_PFS_outcome_score_p_val$variant_class_1 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_1,levels=c("non_targeting_control","control","canonical drug resistance","driver"))
wilcox_PFS_outcome_score_p_val$variant_class_2 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_2,levels=c("non_targeting_control","control","canonical drug resistance","driver"))
}
ggplot(wilcox_PFS_outcome_score_p_val,aes(x=variant_class_1,y=variant_class_2,fill=-log10(FDR),label=FDR)) + geom_raster()+geom_text(size=5,color="white")+theme_classic(base_size=14)+xlab("")+ylab("")+scale_fill_continuous_tableau(na.value="white",palette="Red-Gold")+ggtitle(paste0("p-values: PFS outcome score: ",yy))
We add the average diffusion score (for drug addiction(ABE)/driver(CBE)) and PSF outcome score to the validation library, and saving the annotated validation library.
validation_lib$diffusion_score <- NA
for (j in 1:length(gRNAs_resistance_driver))
{validation_lib$diffusion_score[validation_lib$ID==gRNAs_resistance_driver[j]] <-
mean(sce_iBAR_resistance_one_gRNA$diffusion_score[grepl(gRNAs_resistance_driver[j],sce_iBAR_resistance_one_gRNA$gRNA)],na.rm=TRUE)}
validation_lib$PFS_outcome_score <- NA
validation_lib$PFS_outcome_score[match(df_cor_R_all$gRNA,validation_lib$ID)] <- df_cor_R_all$PFS_outcome_score
write.table(validation_lib,file=paste0("validation_lib_ann_",yy,".csv"),sep=",",col.names=TRUE,row.names=FALSE)
We add additional meta-data to the sce_iBAR SingleCellExperimeriment object and saving it.
sce_iBAR$diffusion_score <- NA
sce_iBAR[,colnames(sce_iBAR_resistance_one_gRNA)]$diffusion_score <- sce_iBAR_resistance_one_gRNA$diffusion_score
saveRDS(sce_iBAR,file=sce_save_file)